###########################################################################
## Challenging Encounters and Within-Physician Practice Variability
## Script 3- Estimate the propensity score of testing
## Last update: 29-1-23
###########################################################################

## Setup
###########################################################################

# Packages
library(xtable)  
library(dplyr)   
library(readxl)
library(lmtest) 
library(lubridate)
library(xgboost)

# Directory
wd <- "."    
setwd(wd)                                              
pd <- "./project"

###########################################################################

## Sample for the PS estimation
###########################################################################

# Import primary care visits
Visits<-readRDS(file.path("R_Mac","R_files","Visits"))
Visits<-Visits[Visits$specialization_code==10,]

# Keep only full-time physicians
phys<-summarise(group_by(Visits,month(Visits$date),year(Visits$date),doctor_id),tot=n())
phys<-phys[phys$tot>100,]
phys<-summarise(group_by(phys,doctor_id),months=n(),mean_visits_mon=mean(tot))
phys<-phys[phys$months>12,]
Visits<-merge(Visits,phys[,c("doctor_id")],by="doctor_id")

# Add patient characteristics
pat<-readRDS(file.path("R_Mac","R_files","Patients"))
pat<-pat[,c(1,2,3,4,10,17:31)]
Data<-merge(pat,Visits,by="patient_id")
Visits<-NULL
gc()

# Keep a random subset of 1M visits
set.seed(19)
sam <- sample(length(Data$patient_id),10^6)
Data <- Data[sam,]
gc()

# Patient information
Data$age<-year(Data$date)-Data$birth_date
Data$TIA<-difftime(as.Date(Data$date_from_tia),Data$time)<0
Data$Diabetic<-difftime(as.Date(Data$date_from_diabetic),Data$time)<0
Data$CVD<-difftime(as.Date(Data$date_from_cvd),Data$time)<0
Data$Hashmana<-difftime(as.Date(Data$date_from_hashmana),Data$time)<0
Data$Cancer_Old<-difftime(as.Date(Data$date_from_cancer),Data$time)<0
Data$Blood_pressure<-difftime(as.Date(Data$date_from_bloodpresure),Data$time)<0
Data$Dializa<-difftime(as.Date(Data$date_from_dializa),Data$time)<0
Data$COPD<-difftime(as.Date(Data$date_from_copd),Data$time)<0
Data$Osteo<-difftime(as.Date(Data$date_from_osteo),Data$time)<0
Data$CHF<-difftime(as.Date(Data$date_from_chf),Data$time)<0
Data$CKD<-difftime(as.Date(Data$date_from_ckd),Data$time)<0
Data$Fertility<-difftime(as.Date(Data$date_from_fertility),Data$time)<0
Data$Cardio<-difftime(as.Date(Data$date_from_cardio),Data$time)<0

Data$TIA[year(Data$date_from_tia)==1800]<-0
Data$Diabetic[year(Data$date_from_diabetic)==1800]<-0
Data$CVD[year(Data$date_from_cvd)==1800]<-0
Data$Hashmana[year(Data$date_from_hashmana)==1800]<-0
Data$Cancer_Old[year(Data$date_from_cancer)==1800]<-0
Data$Blood_pressure[year(Data$date_from_bloodpresure)==1800]<-0
Data$Dializa[year(Data$date_from_dializa)==1800]<-0
Data$COPD[year(Data$date_from_copd)==1800]<-0
Data$Osteo[year(Data$date_from_osteo)==1800]<-0
Data$CHF[year(Data$date_from_chf)==1800]<-0
Data$CKD[year(Data$date_from_ckd)==1800]<-0
Data$Fertility[year(Data$date_from_fertility)==1800]<-0
Data$Cardio[year(Data$date_from_cardio)==1800]<-0

Data$TIA_Recent<-difftime(as.Date(Data$date_from_tia),Data$time)<0 & difftime(as.Date(Data$date_from_tia),Data$time)>(-181)
Data$Diabetic_Recent<-difftime(as.Date(Data$date_from_diabetic),Data$time)<0 & difftime(as.Date(Data$date_from_diabetic),Data$time)>(-181)
Data$CVD_Recent<-difftime(as.Date(Data$date_from_cvd),Data$time)<0 & difftime(as.Date(Data$date_from_cvd),Data$time)>(-181)
Data$Hashmana_Recent<-difftime(as.Date(Data$date_from_hashmana),Data$time)<0 & difftime(as.Date(Data$date_from_hashmana),Data$time)>(-181)
Data$Cancer_Old_Recent<-difftime(as.Date(Data$date_from_cancer),Data$time)<0 & difftime(as.Date(Data$date_from_cancer),Data$time)>(-181)
Data$Blood_pressure_Recent<-difftime(as.Date(Data$date_from_bloodpresure),Data$time)<0 & difftime(as.Date(Data$date_from_bloodpresure),Data$time)>(-181)
Data$Dializa_Recent<-difftime(as.Date(Data$date_from_dializa),Data$time)<0 & difftime(as.Date(Data$date_from_dializa),Data$time)>(-181)
Data$COPD_Recent<-difftime(as.Date(Data$date_from_copd),Data$time)<0 & difftime(as.Date(Data$date_from_copd),Data$time)>(-181)
Data$Osteo_Recent<-difftime(as.Date(Data$date_from_osteo),Data$time)<0 & difftime(as.Date(Data$date_from_osteo),Data$time)>(-181)
Data$CHF_Recent<-difftime(as.Date(Data$date_from_chf),Data$time)<0 & difftime(as.Date(Data$date_from_chf),Data$time)>(-181)
Data$CKD_Recent<-difftime(as.Date(Data$date_from_ckd),Data$time)<0 & difftime(as.Date(Data$date_from_ckd),Data$time)>(-181)
Data$Fertility_Recent<-difftime(as.Date(Data$date_from_fertility),Data$time)<0 & difftime(as.Date(Data$date_from_fertility),Data$time)>(-181)
Data$Cardio_Recent<-difftime(as.Date(Data$date_from_cardio),Data$time)<0 & difftime(as.Date(Data$date_from_cardio),Data$time)>(-181)


# Keep relevant columns
Data<-Data[,c(1,3:5,21:23,26:53)]
gc()

# Visit timing information
Data$weekday<-weekdays(Data$date)
Data$yearweek<-week(Data$date)
Data$year<-year(Data$date)
Data<-mutate(group_by(Data,date,doctor_id),num_visit=1:n())
Data$Hour<-substr(Data$time,12,13)
Data$Minutes<-substr(Data$time,15,16)
Data$Time_Fixed<-as.numeric(Data$Hour)+as.numeric(Data$Minutes)/60

# Save
Data<-Data[order(Data$doctor_id,Data$date),]
saveRDS(Data,file.path(pd,"data","data_for_ps_estimation_1"))
Data<-NULL
pat<-NULL
gc()
###########################################################################

## Add visits and hospitalizations information
###########################################################################

# Visits
Vis<-readRDS(file.path("R_Mac","R_files","Visits"))

# Mark PCP's visits 
Vis$Family<-as.numeric(Vis$specialization_code==10)

## Summarise number of visits weekly by patient
Vis<-summarise(group_by(Vis,patient_id,year(date),week(date)),
               total=n(),
               PCP=sum(Family))

## Count visits in the past 6-3-1 months
for(y in 2012:2015){
  for(w in 1:53){
    # a list of all patients visited in this week
    Temp<-Vis[Vis$`year(date)`==y&Vis$`week(date)`==w,1:4]
    Temp <- distinct(Temp,patient_id,`year(date)`,`week(date)`,.keep_all = T)
    
    # 6 months before this week 
    Visits<-Vis[(Vis$`year(date)`==y&Vis$`week(date)`>(w-25)&Vis$`week(date)`<w)|
                  (Vis$`year(date)`==y-1&Vis$`week(date)`>(27+w)),]
    
    # Mark visits within 1/3 months
    Visits$month_1 <- (Visits$`year(date)`==y&Visits$`week(date)`>(w-5)&Visits$`week(date)`<w)|
      (Visits$`year(date)`==y-1&Visits$`week(date)`>(48+w))
    Visits$month_3<-(Visits$`year(date)`==y&Visits$`week(date)`>(w-13)&Visits$`week(date)`<w)|
      (Visits$`year(date)`==y-1&Visits$`week(date)`>(40+w))
    
    # Summarise
    Visits<-summarise(group_by(Visits,patient_id),
                      Visits_Last_6_Months=sum(total),
                      Visits_Last_3_Months=sum(month_3*total),
                      Visits_Last_1_Months=sum(month_1*total),
                      Visits_Last_6_Months_PCP=sum(PCP),
                      Visits_Last_3_Months_PCP=sum(month_3*PCP),
                      Visits_Last_1_Months_PCP=sum(month_1*PCP))
    
    # Add to a new data object
    Temp<-merge(Temp,Visits,by=c("patient_id"),all.x=T)
    
    Temp$Visits_Last_6_Months[is.na(Temp$Visits_Last_6_Months)]<-0 
    Temp$Visits_Last_3_Months[is.na(Temp$Visits_Last_3_Months)]<-0 
    Temp$Visits_Last_1_Months[is.na(Temp$Visits_Last_1_Months)]<-0 
    
    Temp$Visits_Last_6_Months_PCP[is.na(Temp$Visits_Last_6_Months_PCP)]<-0 
    Temp$Visits_Last_3_Months_PCP[is.na(Temp$Visits_Last_3_Months_PCP)]<-0 
    Temp$Visits_Last_1_Months_PCP[is.na(Temp$Visits_Last_1_Months_PCP)]<-0 
    
    if(y==2012&w==1){X<-Temp}else{X<-rbind(X,Temp)}
  }
}

# Hospitalizations
Hos<-readRDS(file.path("R_Mac","R_files","Hospitalizations"))
Hos$hosp_length<-as.numeric(Hos$hosp_length)
Hos<-summarise(group_by(Hos,patient_id,year(entrance_date),week(entrance_date)),
               total=n(),
               Days=sum(hosp_length))
Hos$Days[Hos$Days>7] <- 7

for(y in 2012:2015){
  for(w in 1:53){
    Temp<-Hos[Hos$`year(entrance_date)`==y&Hos$`week(entrance_date)`==w,]
    Temp <- distinct(Temp,patient_id,`year(entrance_date)`,`week(entrance_date)`,.keep_all = T)
    
    Hosp<-Hos[(Hos$`year(entrance_date)`==y&Hos$`week(entrance_date)`>(w-25)&Hos$`week(entrance_date)`<w)|
                (Hos$`year(entrance_date)`==y-1&Hos$`week(entrance_date)`>(27+w)),]
    
    Hosp<-summarise(group_by(Hosp,patient_id),Hospitalizations_Last_6_Months=n()
                    ,Days_Hospitalizations_Last_6_Months=sum(Days))
    
    Temp<-merge(Temp,Hosp,by="patient_id",all.x=T)
    Temp$Hospitalizations_Last_6_Months[is.na(Temp$Hospitalizations_Last_6_Months)]<-0 
    Temp$Days_Hospitalizations_Last_6_Months[is.na(Temp$Days_Hospitalizations_Last_6_Months)]<-0 
    if(y==2012&w==1){X2<-Temp}else{X2<-rbind(X2,Temp)}
  }
}

Vis<-NULL
Hos<-NULL
Hosp<-NULL
Visits<-NULL
gc()

# Read sample data
Visits<-readRDS(file.path(pd,"data","data_for_ps_estimation_1"))
Visits$year<-year(Visits$date)
Visits$month<-month(Visits$date)

# Add visits & hospitalization information
colnames(X)[2:3]<-c("year","yearweek")
colnames(X2)[2:3]<-c("year","yearweek")

X2 <- X2[,c("patient_id","year","yearweek","Hospitalizations_Last_6_Months","Days_Hospitalizations_Last_6_Months")]
X <- X[,c("patient_id","year","yearweek","Visits_Last_6_Months","Visits_Last_3_Months",
          "Visits_Last_1_Months","Visits_Last_6_Months_PCP","Visits_Last_3_Months_PCP",
          "Visits_Last_1_Months_PCP")]
Visits<-merge(Visits,X,by=c("patient_id","year","yearweek"),all.x=T)
Visits<-merge(Visits,X2,by=c("patient_id","year","yearweek"),all.x=T)

Visits$Visits_Last_6_Months[is.na(Visits$Visits_Last_6_Months)]<-0
Visits$Visits_Last_3_Months[is.na(Visits$Visits_Last_3_Months)]<-0
Visits$Visits_Last_1_Months[is.na(Visits$Visits_Last_1_Months)]<-0
Visits$Visits_Last_6_Months_PCP[is.na(Visits$Visits_Last_6_Months_PCP)]<-0
Visits$Visits_Last_3_Months_PCP[is.na(Visits$Visits_Last_3_Months_PCP)]<-0
Visits$Visits_Last_1_Months_PCP[is.na(Visits$Visits_Last_1_Months_PCP)]<-0

Visits$Hospitalizations_Last_6_Months[is.na(Visits$Hospitalizations_Last_6_Months)]<-0
Visits$Days_Hospitalizations_Last_6_Months[is.na(Visits$Days_Hospitalizations_Last_6_Months)]<-0

# Save
saveRDS(Visits,file.path(pd,"data","data_for_ps_estimation_2"))
saveRDS(X,file.path(pd,"data","visits_data_for_ps_estimation"))
saveRDS(X2,file.path(pd,"data","hospitalizations_data_for_ps_estimation"))
Visits<-NULL
gc()

###########################################################################

## Add tests/referrals/prescriptions information
###########################################################################

# # Import
# sample<-readRDS(file.path(pd,"data","data_for_ps_estimation_2"))[,c("patient_id","date","doctor_id")]
# sample$doctor_id<-as.numeric(sample$doctor_id)

# Lab tests
Lab<-readRDS(file.path("R_Mac","R_files","Lab_Referrals"))[,c("patient_id","date","doctor_id","lab_code","lab_desc")]
Lab<-merge(Lab,sample,by=c("patient_id","date","doctor_id"))
#sample<-NULL
gc()
Lab$lab_code[grepl("(B)",Lab$lab_desc,fixed=T)]<-5022
Lab$lab_code[grepl("Blood",Lab$lab_desc,fixed=T)]<-5022
Lab$lab_code[grepl("Biopsy",Lab$lab_desc,fixed=T)]<-8300
Lab<-summarise(group_by(Lab,patient_id,date,doctor_id,lab_code),total=n(),lab_desc=getmode(lab_desc))
Lab$Blood<-Lab$lab_code==5022
Lab$Urine<-Lab$lab_code==1000
Lab$Cholesterol<-Lab$lab_code==3718|Lab$lab_code==3719|Lab$lab_code==2465
Lab$Triglycerides<-Lab$lab_code==4478
Lab$Phosphatase<-Lab$lab_code==4075
Lab$TSH<-Lab$lab_code==4443
Lab$Alt_GPT<-Lab$lab_code==4460
Lab$VitaminB12<-Lab$lab_code==2607
Lab$AST_GOT<-Lab$lab_code==4450
Lab$Ferritin<-Lab$lab_code==2728
Lab$Biopsy<-grepl("Biopsy",Lab$lab_desc,fixed=T)
Lab$PSA<-Lab$lab_code==4153
Lab$Occult_Blood<-Lab$lab_code==22731

Lab<-summarise(group_by(Lab,patient_id,date,doctor_id),lab_tests=sum(total),
               lab_codes=n(),lab_dups=sum(total>1),
               Alt_GPT=sum(Alt_GPT>0),
               VitaminB12=sum(VitaminB12>0),
               AST_GOT=sum(AST_GOT>0),
               Ferritin=sum(Ferritin>0),
               Blood=sum(Blood>0),
               Urine=sum(Urine>0),
               Cholesterol=sum(Cholesterol>0),
               Triglycerides=sum(Triglycerides>0),
               Phosphatase=sum(Phosphatase>0),
               TSH=sum(TSH>0),
               Biopsy=sum(Biopsy>0),
               PSA=sum(PSA>0),
               Occult_Blood=sum(Occult_Blood>0))

saveRDS(Lab,file.path(pd,"data","lab_tests_for_ps_estimation"))

Lab<-NULL
X <- NULL
X2 <- NULL
gc()


# Referrals
Refs<-readRDS(file.path("R_Mac","R_files","Referrals"))[,c("patient_id","date","doctor_id","ref_code","ref_desc")]
Refs<-merge(Refs,sample,by=c("patient_id","date","doctor_id"))
#sample<-NULL
gc()
Refs<-summarise(group_by(Refs,patient_id,date,doctor_id,ref_code),total=n(),ref_desc=getmode(ref_desc))
Refs$Dimut<-Refs$ref_code==99939
Refs$Rentgen<-Refs$ref_code==99942
Refs$Ultrasound<-Refs$ref_code==99935
Refs$Density<-Refs$ref_code==99926
Refs$Dufler<-Refs$ref_code==93320
Refs$Heart_Eco<-Refs$ref_code==99884
Refs$ER<-Refs$ref_code==99865
Refs$Emer<-Refs$ref_code==99866
Refs$Mammo<-Refs$ref_code==99936
Refs$EKG<-Refs$ref_code==93000
Refs$Heart_Machon<-Refs$ref_code==99923
Refs$Machon<-Refs$ref_code==99919
Refs$Consult<-grepl("CON",Refs$ref_code,fixed=T)
Refs$Gastro<-Refs$ref_code==99921

Refs<-summarise(group_by(Refs,date,patient_id,doctor_id),Referrals=sum(total),
                ref_codes=n(),ref_dups=sum(total>0),
                Consult=sum(Consult),
                ER=sum(ER),
                Emer=sum(Emer),
                Dimut=sum(Dimut),
                Ultrasound=sum(Ultrasound),
                Rentgen=sum(Rentgen),
                Density=sum(Density),
                Dufler=sum(Dufler),
                Heart_Eco=sum(Heart_Eco),
                Mammo=sum(Mammo),
                Machon=sum(Machon),
                EKG=sum(EKG),
                Heart_Machon=sum(Heart_Machon),
                Gastro=sum(Gastro))
saveRDS(Refs,file.path(pd,"data","referrals_for_ps_estimation"))

Refs<-NULL
gc()



# Prescriptions
Pres <- readRDS(file.path("R_Mac","R_files","Prescriptions"))[,c("patient_id","date_pres","doctor_id","largo_code")]
gc()
colnames(Pres)[2] <- "date"
Pres <- summarise(group_by(Pres,patient_id,year(date),week(date),largo_code),
                  total=n())
colnames(Pres)[2:3] <- c("year","week")
Pres <- summarise(group_by(Pres,patient_id,year,week),
                  Drugs=n(),
                  Drugs_count=sum(total))
saveRDS(Pres,file.path(pd,"data","prescriptions_for_ps_estimation"))
Pres<-NULL
gc()


# Merge everything
Data<-readRDS(file.path(pd,"data","data_for_ps_estimation_2"))
Lab<-readRDS(file.path(pd,"data","lab_tests_for_ps_estimation"))
Data<-merge(Data,Lab,by=colnames(Lab)[1:3],all.x=T)
Lab<-NULL
gc()
Ref<-readRDS(file.path(pd,"data","referrals_for_ps_estimation"))
Data<-merge(Data,Ref,by=colnames(Ref)[1:3],all.x=T)
Ref<-NULL
gc()

for(v in colnames(Data)[44:84]){
  print(v)
  print(mean(Data[[v]]))
  Data[[v]][is.na(Data[[v]])] <- 0
  print(mean(Data[[v]]))
  print("---")
}

## Add Outcomes
Data$Lab_Test<-(Data$Blood+Data$Cholesterol+Data$Triglycerides+Data$Phosphatase+Data$Alt_GPT)>0
Data$Screening_Test<-(Data$Machon+Data$Heart_Machon+Data$Dimut+Data$Ultrasound+Data$Rentgen)>0               
Data$Diagnostic_Test<-(Data$Lab_Test+Data$Screening_Test)>0
# Factor
Data$Diagnostic_Test<-as.factor(Data$Diagnostic_Test)

# Recent chronic conditions in shorter period
Data <- Data[c(1:51,87)]
Pat<-readRDS(file.path("R_Mac","R_files","Patients"))
Data <- merge(Data,Pat[,c(1,17:31)],by="patient_id")

Data$TIA_Recent_3<-difftime(as.Date(Data$date_from_tia),Data$time)<0 & difftime(as.Date(Data$date_from_tia),Data$time)>(-91)
Data$Diabetic_Recent_3<-difftime(as.Date(Data$date_from_diabetic),Data$time)<0 & difftime(as.Date(Data$date_from_diabetic),Data$time)>(-91)
Data$CVD_Recent_3<-difftime(as.Date(Data$date_from_cvd),Data$time)<0 & difftime(as.Date(Data$date_from_cvd),Data$time)>(-91)
Data$Hashmana_Recent_3<-difftime(as.Date(Data$date_from_hashmana),Data$time)<0 & difftime(as.Date(Data$date_from_hashmana),Data$time)>(-91)
Data$Cancer_Old_Recent_3<-difftime(as.Date(Data$date_from_cancer),Data$time)<0 & difftime(as.Date(Data$date_from_cancer),Data$time)>(-91)
Data$Blood_pressure_Recent_3<-difftime(as.Date(Data$date_from_bloodpresure),Data$time)<0 & difftime(as.Date(Data$date_from_bloodpresure),Data$time)>(-91)
Data$Dializa_Recent_3<-difftime(as.Date(Data$date_from_dializa),Data$time)<0 & difftime(as.Date(Data$date_from_dializa),Data$time)>(-91)
Data$COPD_Recent_3<-difftime(as.Date(Data$date_from_copd),Data$time)<0 & difftime(as.Date(Data$date_from_copd),Data$time)>(-91)
Data$Osteo_Recent_3<-difftime(as.Date(Data$date_from_osteo),Data$time)<0 & difftime(as.Date(Data$date_from_osteo),Data$time)>(-91)
Data$CHF_Recent_3<-difftime(as.Date(Data$date_from_chf),Data$time)<0 & difftime(as.Date(Data$date_from_chf),Data$time)>(-91)
Data$CKD_Recent_3<-difftime(as.Date(Data$date_from_ckd),Data$time)<0 & difftime(as.Date(Data$date_from_ckd),Data$time)>(-91)
Data$Fertility_Recent_3<-difftime(as.Date(Data$date_from_fertility),Data$time)<0 & difftime(as.Date(Data$date_from_fertility),Data$time)>(-91)
Data$Cardio_Recent_3<-difftime(as.Date(Data$date_from_cardio),Data$time)<0 & difftime(as.Date(Data$date_from_cardio),Data$time)>(-91)


# Save
saveRDS(Data,file.path(pd,"data","data_for_ps_estimation_3"))

sample<-NULL
Temp<-NULL
Vis<-NULL
gc()

###########################################################################

## Add Old Tests/Drugs variables
###########################################################################
# # Import
# Data<-readRDS(file.path(pd,"data","data_for_ps_estimation_3"))
 
# Tests
Lab <- readRDS(file.path(pd,"data","lab_tests_for_ps_estimation"))
Lab$Lab_Test <- (Lab$Blood+Lab$Cholesterol+Lab$Triglycerides+Lab$Phosphatase+Lab$Alt_GPT)>0
Lab$Lab_Tests <- (Lab$Blood+Lab$Cholesterol+Lab$Triglycerides+Lab$Phosphatase+Lab$Alt_GPT)
Lab$Lab_Tests_extended <- (Lab$VitaminB12+Lab$AST_GOT+Lab$TSH+Lab$Urine+Lab$Ferritin)
Lab$Lab_Tests_cancer_related <- (Lab$PSA+Lab$Biopsy)
Lab$year <- year(Lab$date)
Lab$week <- week(Lab$date)
Lab <- Lab[Lab$Lab_Tests > 0 |Lab$Lab_Tests_extended > 0 | Lab$Lab_Tests_cancer_related > 0, 
           c("patient_id","year","week","Lab_Tests","Lab_Tests_extended","Lab_Tests_cancer_related")]
Lab <- summarise_all(group_by(Lab,patient_id,year,week),sum)
gc()

# Referrals
Ref<-readRDS(file.path(pd,"data","referrals_for_ps_estimation"))
Ref$Screening_Test <- (Ref$Machon+Ref$Heart_Machon+Ref$Dimut+Ref$Ultrasound+Ref$Rentgen)>0               
Ref$Screening_Tests_extended <- (Ref$Consult+Ref$ER+Ref$Density+Ref$Dufler+Ref$EKG)  
Ref$Screening_Tests_cancer_related <- (Ref$Mammo+Ref$Gastro)  
Ref$Screening_Tests <- (Ref$Machon+Ref$Heart_Machon+Ref$Dimut+Ref$Ultrasound+Ref$Rentgen)              
Ref$year <- year(Ref$date)
Ref$week <- week(Ref$date)
Ref <- Ref[Ref$Screening_Tests>0 | Ref$Screening_Tests_extended>0 | Ref$Screening_Tests_cancer_related>0,
           c("patient_id","year","week","Screening_Tests","Screening_Tests_extended","Screening_Tests_cancer_related")]
Ref <- summarise_all(group_by(Ref,patient_id,year,week),sum)

# Prescriptions
Pres<-readRDS(file.path(pd,"data","prescriptions_for_ps_estimation"))

# Count old tests/referrals/prescriptions for every patient-month
for(y in 2012:2015){
  for(w in 1:53){
    
    Temp<-Data[Data$year==y&Data$yearweek==w,c("patient_id","year","yearweek")]
    Temp <- distinct(Temp,patient_id,year,yearweek)
    # 6 months before this week 
    L<-Lab[(Lab$year==y&Lab$week>(w-25)&Lab$week<w)|
             (Lab$year==y-1&Lab$week>(27+w)),]
    
    R<-Ref[(Ref$year==y&Ref$week>(w-25)&Ref$week<w)|
             (Ref$year==y-1&Ref$week>(27+w)),]
    
    P<-Pres[(Pres$year==y&Pres$week>(w-25)&Pres$week<w)|
             (Pres$year==y-1&Pres$week>(27+w)),]
    
    
    # Mark different periods
    L$month_1 <- (L$year==y&L$week>(w-5)&L$week<w)|
      (L$year==y-1&L$week>(48+w))
    L$month_3<-(L$year==y&L$week>(w-13)&L$week<w)|
      (L$year==y-1&L$week>(40+w))
    
    R$month_1 <- (R$year==y&R$week>(w-5)&R$week<w)|
      (R$year==y-1&R$week>(48+w))
    R$month_3<-(R$year==y&R$week>(w-13)&R$week<w)|
      (R$year==y-1&R$week>(40+w))
    
    P$month_1 <- (P$year==y&P$week>(w-5)&P$`week`<w)|
      (P$`year`==y-1&P$`week`>(48+w))
    P$month_3<-(P$`year`==y&P$`week`>(w-13)&P$`week`<w)|
      (P$`year`==y-1&P$`week`>(40+w))
    
    
    L<-summarise(group_by(L,patient_id),
                 Lab_Tests_Last_6_Months=sum(Lab_Tests),
                 Lab_Tests_Ex_Last_6_Months=sum(Lab_Tests_extended),
                 Lab_Tests_Ca_Last_6_Months=sum(Lab_Tests_cancer_related),
                 Lab_Tests_Last_3_Months=sum(month_3*Lab_Tests),
                 Lab_Tests_Ex_Last_3_Months=sum(month_3*Lab_Tests_extended),
                 Lab_Tests_Ca_Last_3_Months=sum(month_3*Lab_Tests_cancer_related),
                 Lab_Tests_Last_1_Months=sum(month_1*Lab_Tests),
                 Lab_Tests_Ex_Last_1_Months=sum(month_1*Lab_Tests_extended),
                 Lab_Tests_Ca_Last_1_Months=sum(month_1*Lab_Tests_cancer_related))
    
    R<-summarise(group_by(R,patient_id),
                 Ref_Tests_Last_6_Months=sum(Screening_Tests),
                 Ref_Tests_Last_3_Months=sum(month_3*Screening_Tests),
                 Ref_Tests_Last_1_Months=sum(month_1*Screening_Tests),
                 Ref_Tests_Last_Ex_6_Months=sum(Screening_Tests_extended),
                 Ref_Tests_Last_Ex_3_Months=sum(month_3*Screening_Tests_extended),
                 Ref_Tests_Last_Ex_1_Months=sum(month_1*Screening_Tests_extended),
                 Ref_Tests_Last_Ca_6_Months=sum(Screening_Tests_cancer_related),
                 Ref_Tests_Last_Ca_3_Months=sum(month_3*Screening_Tests_cancer_related),
                 Ref_Tests_Last_Ca_1_Months=sum(month_1*Screening_Tests_cancer_related))
    
    P<-summarise(group_by(P,patient_id),
                 Pres_Last_6_Months=sum(Drugs),
                 Pres_Last_3_Months=sum(month_3*Drugs),
                 Pres_Last_1_Months=sum(month_1*Drugs),
                 Pres_Count_Last_6_Months=sum(Drugs_count),
                 Pres_Count_Last_3_Months=sum(month_3*Drugs_count),
                 Pres_Count_Last_1_Months=sum(month_1*Drugs_count))
    
    Temp<-merge(Temp,L,by=c("patient_id"),all.x=T)
    Temp<-merge(Temp,R,by=c("patient_id"),all.x=T)
    Temp<-merge(Temp,P,by=c("patient_id"),all.x=T)
    
    if(y==2012&w==1){X<-Temp}else{X<-rbind(X,Temp)}
  }
}

# Add to the main Data
Data <- merge(Data,X,by=c("patient_id","year","yearweek"),all.x=T)

# Summarise missing values
mean.nas <- function(x){mean(is.na(x))}
a <- summarise_all(Data,mean.nas)

## Omit NAs
for(i in 1:length(a)){
  if(a[,i]!=0){
    v <- colnames(Data)[i]
    Data[[v]][is.na(Data[[v]])] <- 0
  }else{print(i)}
}

# Omit NAs
Data<-na.omit(Data)

# Save
saveRDS(Data,file.path(pd,"data","data_for_ps_estimation_final"))


###########################################################################


## Prepare the research data for prediction
###########################################################################

# Import
Data <- readRDS(file.path(pd,"data","data_final"))

# Fix missing variables
Data$age <- Data$year - Data$birth_date
Data$age[is.na(Data$age)] <- 52
Data$gender[is.na(Data$gender)] <- 0.5
Data$Cancer_Old[is.na(Data$Cancer_Old)] <- 0

# Add columns for recent chronic conditions
Data$TIA_Recent_3 <- difftime(as.Date(Data$date_from_tia),Data$time)<0 & difftime(as.Date(Data$date_from_tia),Data$time)>(-91)
Data$Diabetic_Recent_3 <- difftime(as.Date(Data$date_from_diabetic),Data$time)<0 & difftime(as.Date(Data$date_from_diabetic),Data$time)>(-91)
Data$CVD_Recent_3 <- difftime(as.Date(Data$date_from_cvd),Data$time)<0 & difftime(as.Date(Data$date_from_cvd),Data$time)>(-91)
Data$Hashmana_Recent_3 <- difftime(as.Date(Data$date_from_hashmana),Data$time)<0 & difftime(as.Date(Data$date_from_hashmana),Data$time)>(-91)
Data$Cancer_Old_Recent_3 <- difftime(as.Date(Data$date_from_cancer),Data$time)<0 & difftime(as.Date(Data$date_from_cancer),Data$time)>(-91)
Data$Blood_pressure_Recent_3 <- difftime(as.Date(Data$date_from_bloodpresure),Data$time)<0 & difftime(as.Date(Data$date_from_bloodpresure),Data$time)>(-91)
Data$Dializa_Recent_3 <- difftime(as.Date(Data$date_from_dializa),Data$time)<0 & difftime(as.Date(Data$date_from_dializa),Data$time)>(-91)
Data$COPD_Recent_3 <- difftime(as.Date(Data$date_from_copd),Data$time)<0 & difftime(as.Date(Data$date_from_copd),Data$time)>(-91)
Data$Osteo_Recent_3 <- difftime(as.Date(Data$date_from_osteo),Data$time)<0 & difftime(as.Date(Data$date_from_osteo),Data$time)>(-91)
Data$CHF_Recent_3 <- difftime(as.Date(Data$date_from_chf),Data$time)<0 & difftime(as.Date(Data$date_from_chf),Data$time)>(-91)
Data$CKD_Recent_3 <- difftime(as.Date(Data$date_from_ckd),Data$time)<0 & difftime(as.Date(Data$date_from_ckd),Data$time)>(-91)
Data$Fertility_Recent_3 <- difftime(as.Date(Data$date_from_fertility),Data$time)<0 & difftime(as.Date(Data$date_from_fertility),Data$time)>(-91)
Data$Cardio_Recent_3 <- difftime(as.Date(Data$date_from_cardio),Data$time)<0 & difftime(as.Date(Data$date_from_cardio),Data$time)>(-91)

# Add information on old visits and hospitalizations
X <- readRDS(file.path(pd,"data","visits_data_for_ps_estimation"))
X2 <- readRDS(file.path(pd,"data","hospitalizations_data_for_ps_estimation"))
colnames(X)[3] <- "Yearweek"
colnames(X2)[3] <- "Yearweek"
Data3 <- merge(Data[Data$Control==3,],X,by=c("patient_id","year","Yearweek"),all.x=T)
Data3 <- merge(Data3,X2,by=c("patient_id","year","Yearweek"),all.x=T)
gc()
Data0 <- merge(Data[Data$Control!=3,],X,by=c("patient_id","year","Yearweek"),all.x=T)
Data0 <- merge(Data0,X2,by=c("patient_id","year","Yearweek"),all.x=T)
Data <- rbind(Data3,Data0)
rm(Data3,Data0)

X <- NULL
X2 <- NULL
gc()

Data$Visits_Last_6_Months[is.na(Data$Visits_Last_6_Months)] <- 0
Data$Visits_Last_3_Months[is.na(Data$Visits_Last_3_Months)] <- 0
Data$Visits_Last_1_Months[is.na(Data$Visits_Last_1_Months)] <- 0
Data$Visits_Last_6_Months_PCP[is.na(Data$Visits_Last_6_Months_PCP)] <- 0
Data$Visits_Last_3_Months_PCP[is.na(Data$Visits_Last_3_Months_PCP)] <- 0
Data$Visits_Last_1_Months_PCP[is.na(Data$Visits_Last_1_Months_PCP)] <- 0

Data$Hospitalizations_Last_6_Months[is.na(Data$Hospitalizations_Last_6_Months)] <- 0
Data$Days_Hospitalizations_Last_6_Months[is.na(Data$Days_Hospitalizations_Last_6_Months)] <- 0


# Add old tests/referrals/prescriptions
Lab <- readRDS(file.path(pd,"data","lab_tests_for_ps_estimation"))
Lab$Lab_Test <- (Lab$Blood+Lab$Cholesterol+Lab$Triglycerides+Lab$Phosphatase+Lab$Alt_GPT)>0
Lab$Lab_Tests <- (Lab$Blood+Lab$Cholesterol+Lab$Triglycerides+Lab$Phosphatase+Lab$Alt_GPT)
Lab$Lab_Tests_extended <- (Lab$VitaminB12+Lab$AST_GOT+Lab$TSH+Lab$Urine+Lab$Ferritin)
Lab$Lab_Tests_cancer_related <- (Lab$PSA+Lab$Biopsy)
Lab$year <- year(Lab$date)
Lab$week <- week(Lab$date)
Lab <- Lab[Lab$Lab_Tests > 0 |Lab$Lab_Tests_extended > 0 | Lab$Lab_Tests_cancer_related > 0, 
           c("patient_id","year","week","Lab_Tests","Lab_Tests_extended","Lab_Tests_cancer_related")]
Lab <- summarise_all(group_by(Lab,patient_id,year,week),sum)
gc()

Ref <- readRDS(file.path(pd,"data","referrals_for_ps_estimation"))
Ref$Screening_Test <- (Ref$Machon+Ref$Heart_Machon+Ref$Dimut+Ref$Ultrasound+Ref$Rentgen)>0               
Ref$Screening_Tests_extended <- (Ref$Consult+Ref$ER+Ref$Density+Ref$Dufler+Ref$EKG)  
Ref$Screening_Tests_cancer_related <- (Ref$Mammo+Ref$Gastro)  
Ref$Screening_Tests <- (Ref$Machon+Ref$Heart_Machon+Ref$Dimut+Ref$Ultrasound+Ref$Rentgen)              
Ref$year <- year(Ref$date)
Ref$week <- week(Ref$date)
Ref <- Ref[Ref$Screening_Tests>0 | Ref$Screening_Tests_extended>0 | Ref$Screening_Tests_cancer_related>0,
           c("patient_id","year","week","Screening_Tests","Screening_Tests_extended","Screening_Tests_cancer_related")]
Ref <- summarise_all(group_by(Ref,patient_id,year,week),sum)


Pres <- readRDS(file.path(pd,"data","prescriptions_for_ps_estimation"))

colnames(Data)[3] <- "yearweek"
# Count old tests/pres for every patient-month
for(y in 2012:2015){
  for(w in 1:53){
    
    Temp <- Data[Data$year==y&Data$yearweek==w,c("patient_id","year","yearweek")]
    Temp <- distinct(Temp,patient_id,year,yearweek)
    # 6 months before this week 
    L <- Lab[(Lab$year==y&Lab$week>(w-25)&Lab$week<w)|
               (Lab$year==y-1&Lab$week>(27+w)),]
    
    R <- Ref[(Ref$year==y&Ref$week>(w-25)&Ref$week<w)|
               (Ref$year==y-1&Ref$week>(27+w)),]
    
    P <- Pres[(Pres$year==y&Pres$week>(w-25)&Pres$week<w)|
                (Pres$year==y-1&Pres$week>(27+w)),]
    
    
    # Mark different periods
    L$month_1 <- (L$year==y&L$week>(w-5)&L$week<w)|
      (L$year==y-1&L$week>(48+w))
    L$month_3 <- (L$year==y&L$week>(w-13)&L$week<w)|
      (L$year==y-1&L$week>(40+w))
    
    R$month_1 <- (R$year==y&R$week>(w-5)&R$week<w)|
      (R$year==y-1&R$week>(48+w))
    R$month_3 <- (R$year==y&R$week>(w-13)&R$week<w)|
      (R$year==y-1&R$week>(40+w))
    
    P$month_1 <- (P$year==y&P$week>(w-5)&P$`week`<w)|
      (P$`year`==y-1&P$`week`>(48+w))
    P$month_3 <- (P$`year`==y&P$`week`>(w-13)&P$`week`<w)|
      (P$`year`==y-1&P$`week`>(40+w))
    
    
    L <- summarise(group_by(L,patient_id),
                   Lab_Tests_Last_6_Months=sum(Lab_Tests),
                   Lab_Tests_Ex_Last_6_Months=sum(Lab_Tests_extended),
                   Lab_Tests_Ca_Last_6_Months=sum(Lab_Tests_cancer_related),
                   Lab_Tests_Last_3_Months=sum(month_3*Lab_Tests),
                   Lab_Tests_Ex_Last_3_Months=sum(month_3*Lab_Tests_extended),
                   Lab_Tests_Ca_Last_3_Months=sum(month_3*Lab_Tests_cancer_related),
                   Lab_Tests_Last_1_Months=sum(month_1*Lab_Tests),
                   Lab_Tests_Ex_Last_1_Months=sum(month_1*Lab_Tests_extended),
                   Lab_Tests_Ca_Last_1_Months=sum(month_1*Lab_Tests_cancer_related))
    
    R <- summarise(group_by(R,patient_id),
                   Ref_Tests_Last_6_Months=sum(Screening_Tests),
                   Ref_Tests_Last_3_Months=sum(month_3*Screening_Tests),
                   Ref_Tests_Last_1_Months=sum(month_1*Screening_Tests),
                   Ref_Tests_Last_Ex_6_Months=sum(Screening_Tests_extended),
                   Ref_Tests_Last_Ex_3_Months=sum(month_3*Screening_Tests_extended),
                   Ref_Tests_Last_Ex_1_Months=sum(month_1*Screening_Tests_extended),
                   Ref_Tests_Last_Ca_6_Months=sum(Screening_Tests_cancer_related),
                   Ref_Tests_Last_Ca_3_Months=sum(month_3*Screening_Tests_cancer_related),
                   Ref_Tests_Last_Ca_1_Months=sum(month_1*Screening_Tests_cancer_related))
    
    P <- summarise(group_by(P,patient_id),
                   Pres_Last_6_Months=sum(Drugs),
                   Pres_Last_3_Months=sum(month_3*Drugs),
                   Pres_Last_1_Months=sum(month_1*Drugs),
                   Pres_Count_Last_6_Months=sum(Drugs_count),
                   Pres_Count_Last_3_Months=sum(month_3*Drugs_count),
                   Pres_Count_Last_1_Months=sum(month_1*Drugs_count))
    
    Temp <- merge(Temp,L,by=c("patient_id"),all.x=T)
    Temp <- merge(Temp,R,by=c("patient_id"),all.x=T)
    Temp <- merge(Temp,P,by=c("patient_id"),all.x=T)
    
    if(y==2012&w==1){X <- Temp}else{X <- rbind(X,Temp)}
  }
}

rm(Ref,R,Lab,L,P,Pres)
gc()

# Add to the main Data
Data3 <- merge(Data[Data$Control==3,],X,by=c("patient_id","year","yearweek"),all.x=T)
Data0 <- merge(Data[Data$Control!=3,],X,by=c("patient_id","year","yearweek"),all.x=T)

Data <- rbind(Data3,Data0)
rm(Data3,Data0)
gc()

for(v in colnames(Data)[175:206]){
  Data[[v]][is.na(Data[[v]])] <- 0
}

# Save
saveRDS(Data,file.path(pd,"data","data_final_for_prediction.R"))

###########################################################################


## Prediction
###########################################################################

# Define loss function
Cross_entropy <- function(y,yhat){
  x <- sum( as.numeric(y==1) * (-log(yhat)) + as.numeric(y==0) * (-log(1-yhat)))
}

# Import
Data <- readRDS(file.path(pd,"data","data_for_ps_estimation_final"))

# Fix randomization
set.seed(1)


# Random indices to generate 6 train samples (different sizes)
sam7 <- 1:length(Data$patient_id)
for(i in 6:1){
  size <- 25*2^(i-1)
  assign(paste("sam",i,sep=""), sample(nrow(data.frame(Data[get(paste("sam",i+1,sep="")),])),size*1000))
  sample <- Data[get(paste("sam",i,sep="")),]
  t_xgb <- xgb.DMatrix(as.matrix(sample[,c("gender","age","TIA","Diabetic","CVD","Hashmana","Cancer_Old","Cardio","Fertility",
                                           "CKD","CHF","Osteo","COPD","Dializa","Blood_pressure","TIA_Recent","Diabetic_Recent",
                                           "CVD_Recent","Hashmana_Recent","Cancer_Old_Recent","Cardio_Recent","Fertility_Recent",
                                           "CKD_Recent","CHF_Recent","Osteo_Recent","COPD_Recent","Dializa_Recent",
                                           "Blood_pressure_Recent","Lab_Tests_Last_6_Months","Lab_Tests_Last_3_Months",
                                           "Lab_Tests_Last_1_Months","Ref_Tests_Last_6_Months","Ref_Tests_Last_3_Months",
                                           "Ref_Tests_Last_1_Months","Lab_Tests_Ca_Last_6_Months","Lab_Tests_Ca_Last_3_Months",
                                           "Lab_Tests_Ca_Last_1_Months","Ref_Tests_Last_Ca_6_Months","Ref_Tests_Last_Ca_3_Months",
                                           "Ref_Tests_Last_Ca_1_Months","Lab_Tests_Ex_Last_6_Months","Lab_Tests_Ex_Last_3_Months",
                                           "Lab_Tests_Ex_Last_1_Months","Ref_Tests_Last_Ex_6_Months","Ref_Tests_Last_Ex_3_Months",
                                           "Ref_Tests_Last_Ex_1_Months","Pres_Last_6_Months","Pres_Last_3_Months",
                                           "Pres_Last_1_Months","Pres_Count_Last_6_Months","Pres_Count_Last_3_Months",
                                           "Pres_Count_Last_1_Months","Visits_Last_1_Months","Visits_Last_3_Months",
                                           "Visits_Last_6_Months","Visits_Last_1_Months_PCP","Visits_Last_3_Months_PCP",
                                           "Visits_Last_6_Months_PCP","Hospitalizations_Last_6_Months",
                                           "Days_Hospitalizations_Last_6_Months")]),label=as.numeric(sample$Diagnostic_Test)-1)
  
  assign(paste("sample",i,sep=""),t_xgb)
}

# Test sample
test <- Data[-sam6,]
t_xgb <- xgb.DMatrix(as.matrix(test[,c("gender","age","TIA","Diabetic","CVD","Hashmana","Cancer_Old","Cardio","Fertility",
                                       "CKD","CHF","Osteo","COPD","Dializa","Blood_pressure","TIA_Recent","Diabetic_Recent",
                                       "CVD_Recent","Hashmana_Recent","Cancer_Old_Recent","Cardio_Recent","Fertility_Recent",
                                       "CKD_Recent","CHF_Recent","Osteo_Recent","COPD_Recent","Dializa_Recent",
                                       "Blood_pressure_Recent","Lab_Tests_Last_6_Months","Lab_Tests_Last_3_Months",
                                       "Lab_Tests_Last_1_Months","Ref_Tests_Last_6_Months","Ref_Tests_Last_3_Months",
                                       "Ref_Tests_Last_1_Months","Lab_Tests_Ca_Last_6_Months","Lab_Tests_Ca_Last_3_Months",
                                       "Lab_Tests_Ca_Last_1_Months","Ref_Tests_Last_Ca_6_Months","Ref_Tests_Last_Ca_3_Months",
                                       "Ref_Tests_Last_Ca_1_Months","Lab_Tests_Ex_Last_6_Months","Lab_Tests_Ex_Last_3_Months",
                                       "Lab_Tests_Ex_Last_1_Months","Ref_Tests_Last_Ex_6_Months","Ref_Tests_Last_Ex_3_Months",
                                       "Ref_Tests_Last_Ex_1_Months","Pres_Last_6_Months","Pres_Last_3_Months",
                                       "Pres_Last_1_Months","Pres_Count_Last_6_Months","Pres_Count_Last_3_Months",
                                       "Pres_Count_Last_1_Months","Visits_Last_1_Months","Visits_Last_3_Months",
                                       "Visits_Last_6_Months","Visits_Last_1_Months_PCP","Visits_Last_3_Months_PCP",
                                       "Visits_Last_6_Months_PCP","Hospitalizations_Last_6_Months",
                                       "Days_Hospitalizations_Last_6_Months")]),label=as.numeric(test$Diagnostic_Test)-1)


# Tune Parameters
etas<-c(0.001*2^(0:5))
gammas<-seq(0,0.5,0.1)
cols<-seq(4,60,8)/60
depths<-seq(4,18,2)

pars <- as.data.frame(list("eta"=1,"gamma"=1,"columns"=1,"depth"=1,
                           "loss"=1))
i<-0
for(e in 1:length(etas)){
  
  for(g in 1:length(gammas)){
    
    for(c in 1:length(cols)){
      
      for(d in 1:length(depths)){
        i <- i+1
        pars[i,] <- c(etas[e],gammas[g],cols[c],depths[d],1)
        
        boost <- xgboost(data=sample1,verbose = 0,
                         nrounds=500,
                         params=list(eta=etas[e],gamma=gammas[g],colsample_bytree=cols[c],max_depth=depths[d]),
                         objective="binary:logistic")
        test$Predicted<-predict(boost,newdata=t_xgb)
        # pars$loss[i]<-(sum((as.numeric(test$Predicted)-(as.numeric(test$Diagnostic_Test)-1))^2))/length(test$Predicted)
        pars$loss[i] <- Cross_entropy(((as.numeric(test$Diagnostic_Test)-1)) , test$Predicted)
        print((pars$loss[i]))
        
      }}}}

# Find optimum
opt_boost <- which(pars$loss == min(pars$loss), arr.ind=T)
eta_opt <- pars[opt_boost,1]
gamma_opt <- pars[opt_boost,2]
col_opt <- pars[opt_boost,3]
dep_opt <- pars[opt_boost,4]

# Check model performance by sample size
Mistakes_size <- rep(0,6)
for(i in 1:6){
  # Boost
  boost <- xgboost(data=get(paste("sample",i,sep="")),nrounds=500,
                   params=list(eta=eta_opt,gamma=gamma_opt,colsample_bytree=col_opt,max_depth=dep_opt),
                   objective="binary:logistic")
  test$Predicted<-predict(boost,newdata=t_xgb)
  # Mistakes_size[i]<-(sum((as.numeric(test$Predicted)-(as.numeric(test$Diagnostic_Test)-1))^2))/length(test$Predicted)
  Mistakes_size[i] <- Cross_entropy(((as.numeric(test$Diagnostic_Test)-1)),test$Predicted)
  print(Mistakes_size[i])
}

size <- 25*2^(1:6-1)
# par(mfrow=c(2,1))
plot(Mistakes_size/length(test$Predicted) ~ size, pch = 16, ylim = c(0.54,0.58), xlim = c(0,1000), col = "darkred")


# By nrounds
Mistakes_ntrees <- rep(0,10)
for(i in seq(1,10,2)){
  # Boost
  boost <- xgboost(data=sample6,nrounds=i*100,
                   params=list(eta=eta_opt,gamma=gamma_opt,colsample_bytree=col_opt,max_depth=dep_opt),
                   objective="binary:logistic")
  test$Predicted<-predict(boost,newdata=t_xgb)
  # Mistakes_ntrees[i]<-(sum((as.numeric(test$Predicted)-(as.numeric(test$Diagnostic_Test)-1))^2))/length(test$Predicted)
  Mistakes_ntrees[i] <- Cross_entropy(((as.numeric(test$Diagnostic_Test)-1)),test$Predicted)
  print(Mistakes_ntrees[i])
}

par(new=T)
plot(Mistakes_ntrees/length(test$Predicted) ~ c(1:10*100), pch = 16, ylim = c(0.54,0.58), xlim = c(0,1000),  col = "blue3", xlab = "", ylab = "")

# Save info on the models
Parameters <- as.data.frame(list("Boost_depth"=dep_opt,"Boost_Cols"=col_opt,
                                 "Boost_gamma"=gamma_opt,"Boost_eta"=eta_opt,"Boost_min_loss"=min(pars$loss)))
write.csv(Parameters,"Emotions/outputs/Prediction_details.csv")

pdf("R_mac/Stats/feelings/Prediction_details.pdf")
plot(Mistakes_size/length(test$Predicted) ~ size, pch = 16, ylim = c(0.54,0.6), xlim = c(0,1000), cex=0.7,col = "darkred")

par(new=T)
plot(Mistakes_ntrees[c(1,3,5,7,9)]/length(test$Predicted) ~ c(c(1,3,5,7,9)*100), pch = 17,cex=0.7, 
     ylim = c(0.54,0.6), xlim = c(0,1000),  col = "blue3", 
     xlab = "", ylab = "")
dev.off()



# Run a model on the train data with the optimal set of parameters
boost_fit <- xgboost(data=sample6,nrounds=500,params=list(eta=0.008,gamma=0.1,colsample_bytree=1,max_depth=4),objective="binary:logistic")

# Predict on test sample
test$px <- predict(boost_fit,newdata=t_xgb)

## Check the allignment with 45* line
q <- quantile(test$px,seq(0,1,0.01))

test$d<-1
for(i in 100:1){
  test$d[test$px < q[i]] <- (i-1)*0.01
}

plot <- summarise(group_by(test,d),mean_px=mean(px),mean_diag=mean(Diagnostic_Test==T))
plot(plot$mean_diag ~ plot$mean_px, bty = "n", xlab = "Average Prediction", xlim = c(0,0.6), ylim = c(0,0.6),
     ylab = "Average Rate of Testing", xaxt = "n",yaxt = "n", pch = 1, col = "blue3", cex = 1)
abline(0,1, col = "darkred")
axis(1,seq(0,1,0.1),pos=0)
axis(2,seq(0,1,0.1),pos=0)

# Run a regression and change the values to fit the 45 line
lm1 <- lm(as.numeric(Diagnostic_Test) ~ px, data = test)
test$final_px <- (test$px + (lm1$coefficients[1]-1)) * (lm1$coefficients[2])


###########################################################################

## Predict on the research data
###########################################################################

# Loss function
Cross_entropy <- function(y,yhat){
  x <- sum( as.numeric(y==1) * (-log(yhat)) + as.numeric(y==0) * (-log(1-yhat)))
}

# Import
Data <- readRDS(file.path(pd,"data","data_for_ps_estimation_final"))

# Fix randomization
set.seed(19)


# Sample Final training
train_xgb <- xgb.DMatrix(as.matrix(Data[,c("gender","age","TIA","Diabetic","CVD","Hashmana","Cancer_Old","Cardio","Fertility",
                                           "CKD","CHF","Osteo","COPD","Dializa","Blood_pressure","TIA_Recent","Diabetic_Recent",
                                           "CVD_Recent","Hashmana_Recent","Cancer_Old_Recent","Cardio_Recent","Fertility_Recent",
                                           "CKD_Recent","CHF_Recent","Osteo_Recent","COPD_Recent","Dializa_Recent",
                                           "Blood_pressure_Recent","Lab_Tests_Last_6_Months","Lab_Tests_Last_3_Months",
                                           "Lab_Tests_Last_1_Months","Ref_Tests_Last_6_Months","Ref_Tests_Last_3_Months",
                                           "Ref_Tests_Last_1_Months","Lab_Tests_Ca_Last_6_Months","Lab_Tests_Ca_Last_3_Months",
                                           "Lab_Tests_Ca_Last_1_Months","Ref_Tests_Last_Ca_6_Months","Ref_Tests_Last_Ca_3_Months",
                                           "Ref_Tests_Last_Ca_1_Months","Lab_Tests_Ex_Last_6_Months","Lab_Tests_Ex_Last_3_Months",
                                           "Lab_Tests_Ex_Last_1_Months","Ref_Tests_Last_Ex_6_Months","Ref_Tests_Last_Ex_3_Months",
                                           "Ref_Tests_Last_Ex_1_Months","Pres_Last_6_Months","Pres_Last_3_Months",
                                           "Pres_Last_1_Months","Pres_Count_Last_6_Months","Pres_Count_Last_3_Months",
                                           "Pres_Count_Last_1_Months","Visits_Last_1_Months","Visits_Last_3_Months",
                                           "Visits_Last_6_Months","Visits_Last_1_Months_PCP","Visits_Last_3_Months_PCP",
                                           "Visits_Last_6_Months_PCP","Hospitalizations_Last_6_Months",
                                           "Days_Hospitalizations_Last_6_Months")]),label=as.numeric(Data$Diagnostic_Test)-1)

# Boosting model
boost1 <- xgboost(data=train_xgb,nrounds=500,params=list(eta=0.008,gamma=0.1,colsample_bytree=1,max_depth=4),objective="binary:logistic")

# Import the research data
Vis<-readRDS(file.path(pd,"data","data_final_for_prediction.R"))

# Fix variables
Vis$Cancer_Old_Recent[is.na(Vis$Cancer_Old_Recent)]<-0
Vis$Cancer_Old[is.na(Vis$Cancer_Old)]<-0

for(v in colnames(Data)[68:104]){
  Vis[[v]] <- as.numeric(Vis[[v]])
  Data[[v]] <- as.numeric(Data[[v]])
}

for(v in colnames(Vis)[175:206]){
  Vis[[v]][is.na(Vis[[v]])] <- 0
}

# Predict
Vis_xg <- xgb.DMatrix(as.matrix(Vis[c("gender","age","TIA","Diabetic","CVD","Hashmana","Cancer_Old","Cardio","Fertility",
                                      "CKD","CHF","Osteo","COPD","Dializa","Blood_pressure","TIA_Recent","Diabetic_Recent",
                                      "CVD_Recent","Hashmana_Recent","Cancer_Old_Recent","Cardio_Recent","Fertility_Recent",
                                      "CKD_Recent","CHF_Recent","Osteo_Recent","COPD_Recent","Dializa_Recent",
                                      "Blood_pressure_Recent","Lab_Tests_Last_6_Months","Lab_Tests_Last_3_Months",
                                      "Lab_Tests_Last_1_Months","Ref_Tests_Last_6_Months","Ref_Tests_Last_3_Months",
                                      "Ref_Tests_Last_1_Months","Lab_Tests_Ca_Last_6_Months","Lab_Tests_Ca_Last_3_Months",
                                      "Lab_Tests_Ca_Last_1_Months","Ref_Tests_Last_Ca_6_Months","Ref_Tests_Last_Ca_3_Months",
                                      "Ref_Tests_Last_Ca_1_Months","Lab_Tests_Ex_Last_6_Months","Lab_Tests_Ex_Last_3_Months",
                                      "Lab_Tests_Ex_Last_1_Months","Ref_Tests_Last_Ex_6_Months","Ref_Tests_Last_Ex_3_Months",
                                      "Ref_Tests_Last_Ex_1_Months","Pres_Last_6_Months","Pres_Last_3_Months",
                                      "Pres_Last_1_Months","Pres_Count_Last_6_Months","Pres_Count_Last_3_Months",
                                      "Pres_Count_Last_1_Months","Visits_Last_1_Months","Visits_Last_3_Months",
                                      "Visits_Last_6_Months","Visits_Last_1_Months_PCP","Visits_Last_3_Months_PCP",
                                      "Visits_Last_6_Months_PCP","Hospitalizations_Last_6_Months",
                                      "Days_Hospitalizations_Last_6_Months")]),label=as.numeric(Vis$Diagnostic_Test)-1)
Vis$Predicted_xg<-predict(boost1,newdata=Vis_xg)

# Fit the 45* line
Vis$Predicted_xg_fit <- (Vis$Predicted_xg + (lm1$coefficients[1]-1)) * (lm1$coefficients[2])


saveRDS(Vis,file.path(pd,"data","data_final_with_prediction.R"))
#saveRDS(Vis,file.path(pd,"data","Placebo_Year","Feelings_Data_Visits_Final"))
###########################################################################


